Preparations

load data wrangling package

library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.2     ✔ dplyr   0.8.1
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

load data

dat_raw <- read_csv("data.csv")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   EZ = col_double(),
##   PLZ = col_double(),
##   ON = col_double(),
##   GstFl = col_double(),
##   Gebäudehöhe = col_double(),
##   Geschoße = col_double(),
##   Zähler = col_double(),
##   Nenner = col_double(),
##   BJ = col_double(),
##   Kaufpreis = col_double(),
##   mGfl = col_double(),
##   percentWidmung = col_double(),
##   aufEZ = col_double(),
##   one_more_time = col_logical(),
##   lon = col_double(),
##   lat = col_double(),
##   price_m2 = col_double(),
##   price_log = col_double(),
##   train = col_double(),
##   cluster = col_double()
## )
## See spec(...) for full column specifications.

take a look at data

(Note: data is from Vienna’s land charge register to which I have added price clusters)

dat_raw

quick visualization

## get map
library(ggmap)
vienna <- make_bbox(lon, lat, dat_raw) %>% 
  get_stamenmap(zoom = 13, maptype = c("toner-lite"))
ggmap(vienna, extent = "device") +
  coord_cartesian() +
  geom_point(data = dat_raw %>%
               filter(train==1),
             aes(x=lon, y=lat, color = as.factor(cluster)),
             alpha=.5) +
  geom_point(data = dat_raw %>%
               filter(train==0),
             aes(x=lon, y=lat),
             alpha=.5, shape=4) +
  theme(legend.position="none")

add id variable (= row number to data to identify individual cases)

dat_id <- dat_raw %>%
  mutate(id = row_number())

select columns we will work with (id, lon, lat, cluster)

dat_select <- dat_id %>%
  select(id, lon, lat, cluster)

separate training and test data

## test data has no assigned 'cluster'
dat_test <- dat_select %>% 
  filter(is.na(cluster)) %>%
  ## remove cluster, because it is missing anyway
  select(-cluster)

## get train data by anti join with original data
dat_train <- dat_select %>%
  anti_join(dat_test, by="id")

Get the nearest neighbors

cross training with test data (combine each case in training set with each case in the test set)

dat_expand <- dat_train %>% 
  crossing(dat_test)

take a quick glance

dat_expand

calculate euclidean distance (= square root of the sum of distance squares)

dat_dist <- dat_expand %>%
  mutate(dist = sqrt((lat-lat1)^2 + (lon-lon1)^2))

get the k (=5) nearest cases per test set case

dat_5nn <- dat_dist %>%
  ## segregate by id1
  group_by(id1) %>%
  ## get the 5 cases with smallest dist
  top_n(-5, dist)  %>%
  ## sort by id1
  arrange(id1)

get mayority vote

dat_classified <- dat_5nn  %>%
  ## segregate by id1 and cluster
  group_by(id1, cluster) %>%
  ## get counts per resulting group
  count() %>%
  ## reduce to highest count
  top_n(1, n) %>%
  ## remove 'n'
  select(-n)

add result to test data

dat_final <- dat_test %>%
  full_join(dat_classified, by=c("id"="id1"))

visually check result

ggmap(vienna, extent = "device") +
  coord_cartesian() +
  geom_point(data = bind_rows(dat_train, dat_final),
             aes(x=lon, y=lat, color = as.factor(cluster)),
             alpha=.5) +
  theme(legend.position="none")
## Coordinate system already present. Adding new coordinate system, which will replace the existing one.

for reproducability

sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] de_AT.UTF-8/de_AT.UTF-8/de_AT.UTF-8/C/de_AT.UTF-8/de_AT.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggmap_3.0.0     forcats_0.4.0   stringr_1.4.0   dplyr_0.8.1    
##  [5] purrr_0.3.2     readr_1.3.1     tidyr_0.8.3     tibble_2.1.2   
##  [9] ggplot2_3.1.1   tidyverse_1.2.1
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_0.2.5  xfun_0.7          haven_2.1.0      
##  [4] lattice_0.20-38   colorspace_1.4-1  generics_0.0.2   
##  [7] htmltools_0.3.6   yaml_2.2.0        rlang_0.3.4      
## [10] pillar_1.4.1      glue_1.3.1        withr_2.1.2      
## [13] modelr_0.1.4      readxl_1.3.1      jpeg_0.1-8       
## [16] plyr_1.8.4        munsell_0.5.0     gtable_0.3.0     
## [19] cellranger_1.1.0  rvest_0.3.4       RgoogleMaps_1.4.3
## [22] evaluate_0.14     labeling_0.3      knitr_1.23       
## [25] curl_3.3          broom_0.5.2       Rcpp_1.0.1       
## [28] scales_1.0.0      backports_1.1.4   jsonlite_1.6     
## [31] rjson_0.2.20      hms_0.4.2         png_0.1-7        
## [34] digest_0.6.19     stringi_1.4.3     grid_3.6.0       
## [37] cli_1.1.0         tools_3.6.0       bitops_1.0-6     
## [40] magrittr_1.5      lazyeval_0.2.2    crayon_1.3.4     
## [43] pkgconfig_2.0.2   xml2_1.2.0        lubridate_1.7.4  
## [46] assertthat_0.2.1  rmarkdown_1.13    httr_1.4.0       
## [49] rstudioapi_0.10   R6_2.4.0          nlme_3.1-139     
## [52] compiler_3.6.0